#Importing libraries
import pandas as pd
#from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import LogisticRegression
# importing ploting libraries
import matplotlib.pyplot as plt
#importing seaborn for statistical plots
import seaborn as sns
#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# calculate accuracy measures and confusion matrix
from sklearn import metrics
# Load the bank marketing dataset (presumably the UCI "bank-full" data — confirm with data owner)
bank_df = pd.read_csv("bank-full.csv")
#printing top rows — left disabled; uncomment to inspect a sample
#bank_df.head(50)
#size of data: (rows, columns) tuple, displayed as the notebook cell output
bank_df.shape
# Column dtypes: per the note below, age/day/duration/campaign/pdays/previous load as int; the rest as 'object'
bank_df.dtypes
$\color{blue}{\text{Here age, day, duration, campaign, pdays and previous are of int type; describe() will report statistics for these numeric columns only}}$
#Exporting to excel file for analysis
bank_df.describe().transpose().to_excel("out.xlsx", sheet_name='bank_describe')
#Displaying the descriptive statistical values
bank_df.describe().transpose()
#checking the value campaign greater than 50
bank_df[bank_df['campaign'] > 50] # count greater than 50 contacted :4
###bank_df[bank_df['campaign'] < 5] #count less than 5 times contacted :39092
# BUG FIX: iterating a DataFrame yields its COLUMN LABELS, not its rows, so the
# old `for row in bank_df[...]` loop counted the 17 columns instead of the
# matching rows. Count the rows directly with a boolean-mask sum.
rowCount = int((bank_df['pdays'] > 500).sum())
print(str(rowCount))
Note: iterating a DataFrame yields its column labels, not its rows, so a plain `for row in df` loop counts columns (this dataset has 17 columns). The number of rows with pdays > 500 should be computed with `len(bank_df[bank_df['pdays'] > 500])` or `(bank_df['pdays'] > 500).sum()`.
$\color{red}{\text{Descriptive STATISTICS results :}}$
|  | mean | min | 25% | 50% | 75% | max | comments | domain understanding |
|---|---|---|---|---|---|---|---|---|
| age | 40.93621021 | 18 | 33 | 39 | 48 | 95 | mean is slightly greater than the median, which suggests a right-skewed distribution; the difference between Q3 and the max value is 47, so there is a high possibility of outliers | expected age range is roughly 18–100, so 95 may be valid but is worth checking as an outlier |
| balance | 1362.272058 | -8019 | 72 | 448 | 1428 | 102127 | mean is again thrice than the median curve is right skewed and the diffrence between Q3 and max is very high expected outliers in this variable | |
| day | 15.80641879 | 1 | 8 | 16 | 21 | 31 | last contacted day of week makes an impact need to check | |
| duration | 258.1630798 | 0 | 103 | 180 | 319 | 4918 | day month and duration will give last contact information | |
| campaign | 2.763840658 | 1 | 1 | 2 | 3 | 63 | potential outliers 4 ppls got highly contacted in this Campaign | checking the csv in this campign this ppl contacted .bank_df[bank_df['campaign'] > 50] is 4 records which means in this campaign 4 ppl contacted more than 50 times. Count for concat less than 5 is 39092 |
| pdays | 40.19782796 | -1 | -1 | -1 | -1 | 871 | maximum values are -1 , count of rows greater than 500 values are 17 | no of days last contacted is negative in some case |
| previous | 0.580323373 | 0 | 0 | 0 | 0 | 275 | number of contacts performed with this client before this campaign; mostly 0 — need to check whether 0 is genuine or also encodes missing info | |
#converting object type to category type happens in a later cell; recheck dtypes first
bank_df.dtypes
#Exporting to excel file for analysis
# BUG FIX: a plain assignment (initail_bank_df = bank_df) only creates a second
# reference to the SAME DataFrame, so every later in-place change (replace,
# drop(inplace=True), category encoding) would also alter the "saved" snapshot.
# Take a real copy of the raw data instead.
initail_bank_df = bank_df.copy() # saving intial df to another df before applying categorical values
bank_df.head(5).to_excel("out_top5_raw.xlsx", sheet_name='bankdf')#in a xlsx file in a sheet
bank_df.head(5)
Iteration 1: $\color{red}{\text{Working on categories }}$
#list the unique job types present in the raw data
jobs = bank_df['job'].unique()
jobs
#counts of values occurring for each job type in the dataset
bank_df['job'].value_counts()
$\color{red}{\text{Gathering knowledge on job types from a domain expert (in this case, Google :-P)}}$ White-collar work may be performed in an office or other administrative setting. Other types of work are those of a blue-collar worker, whose job requires manual labor, and a pink-collar worker, whose labor is related to customer interaction, entertainment, sales, or other service-oriented work.
# Group the detailed job titles into broader "collar" categories in one pass:
#   management / admin.  -> white-collar (office / administrative work)
#   services / housemaid -> pink-collar  (service-oriented work)
bank_df['job'] = bank_df['job'].replace(
    {'management': 'white-collar',
     'admin.': 'white-collar',
     'services': 'pink-collar',
     'housemaid': 'pink-collar'})
# Re-check the distribution after the merge
bank_df['job'].value_counts()
White-collar and pink-collar match the counts we expected. Unemployed, student, retired and unknown can be grouped together since each is relatively infrequent and less likely to match the expectation, so they can be treated as one group. 'unknown' is doubtful and would need domain expertise, but since its count is small, let us also include it in $\color{red}{\text{others}}$.
# Collapse the low-frequency / ambiguous job types into a single 'others' bucket
bank_df['job'] = bank_df['job'].replace(['student','unemployed','retired','unknown'],'others')
# Verify the final job-category distribution
bank_df['job'].value_counts()
More domain analysis : (from google) :The Difference Between Entrepreneurs and the Self-Employed. ... Self-Employed - Working for oneself as a freelancer or the owner of a business rather than for an employer. Entrepreneur - A person who organizes and operates a business or businesses, taking on greater than normal financial risks in order to do so
$\color{red}{\text{No we cant add the self-employed and entrepreneur this field is categorized as per as i can do :-P}}$
***Checking the next column $\color{red}{\text{marital}}$
#'marital' already has a small, clean category set — no need to regroup further
bank_df['marital'].unique()
#checking the next categorical independent variable: education
bank_df['education'].unique()
#this is already categorized
# NOTE: pandas Categorical codes are 0-based in alphabetical order, so the later
# encoding cell maps primary:0, secondary:1, tertiary:2, unknown:3 (not 1..4)
#checking next categorical variable default
bank_df['default'].unique()
#checking the next categorical variable housing
bank_df['housing'].unique() # ---> array(['yes', 'no'], dtype=object) ---> later encoded no:0, yes:1
bank_df['loan'].unique()# ---> array(['yes', 'no'], dtype=object) ---> same no:0 / yes:1 encoding
bank_df['contact'].unique() #---> array(['unknown', 'cellular', 'telephone'],
#dtype=object) -- cellular:0 telephone:1 unknown:2
#according to alphabetical order (0-based codes)
bank_df['month'].unique() #--> if month had ordinal relevance we could map jan..dec to 1..12
# The day and month of last contact are judged not relevant for this model,
# so remove both columns with a single in-place drop.
bank_df.drop(columns=['month', 'day'], inplace=True)
#checking the next categorical variables
bank_df['pdays'].unique() # contains a value -1
#count the rows where pdays == -1 (presumably -1 means "never previously contacted" — TODO confirm)
bank_df[bank_df['pdays'] == -1] # 36954 rows × 15 columns
bank_df.pdays.dtype # column is dtype('int64')
# Too many rows to delete; replace the -1 sentinel with a large value (10000)
# so these rows sort far away from real pdays values instead of below them
bank_df.loc[bank_df['pdays'] == -1,'pdays'] = 10000
# bank_df[bank_df['pdays'] == 10000] output : 36954 rows × 15 columns done successfully
#checking the next categorical variables
bank_df['previous'].unique()
bank_df['poutcome'].unique() # output :array(['unknown', 'failure', 'other', 'success'], dtype=object)
# 'unknown' and 'other' are neither success nor failure (question for a domain
# expert) — merge both into the single label 'other'
bank_df['poutcome'] = bank_df['poutcome'].replace(['unknown','other'],'other')
bank_df['Target'].unique() #array(['no', 'yes'], dtype=object) important note: encoded later as no:0, yes:1
# Encode every remaining object (string) column as integer category codes.
# pandas assigns codes 0..k-1 in alphabetical order of the category labels.
for feature in bank_df.select_dtypes(include='object').columns:
    bank_df[feature] = bank_df[feature].astype('category').cat.codes
# Export the encoded frame for side-by-side comparison with the raw export
bank_df.to_excel("out_data_cleaning.xlsx", sheet_name='bankdfCate')#same file diffrent sheet for comparision
bank_df.head(5)
#checking for nulls: per-column counts over rows that contain at least one null
# (an empty/zero result means the dataset has no missing values)
bank_df[bank_df.isnull().any(axis=1)].count()
# Descriptive statistics after all encoding/cleaning steps
bank_df.describe().transpose()
#Create a separate dataframe consisting of the independent variables (features)
bank_features_df = bank_df.drop(labels='Target', axis =1)
bank_features_df.head(5)
# Pairwise scatter / distribution plots of all features, colored by the target class
sns.pairplot(bank_df, hue="Target")
#from the pair plot, a roughly normal-looking curve forms for 'duration'
#plotting an individual distribution for duration
# FIX: seaborn's distplot() is deprecated (removed in seaborn 0.14);
# histplot(..., kde=True, stat="density") is the modern equivalent.
sns.histplot(bank_df['duration'], bins=100, kde=True, stat="density")
#splitting the data into a training set and a test set
X = bank_features_df #feature matrix, without the Target column
# NOTE: pop() removes 'Target' from bank_df in place and returns it as a Series
y = bank_df.pop("Target")
test_size = 0.30 # taking a 70:30 train/test split
seed =1 # Random number seeding for repeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
Decision tree without Regularisation
# Fit an unconstrained (fully grown) decision tree using entropy as the split criterion
dt_model = DecisionTreeClassifier(criterion='entropy')
dt_model.fit(X_train, y_train)
# Tabulate feature importances, one row per training column
importances = pd.DataFrame({"Imp": dt_model.feature_importances_},
                           index=X_train.columns)
print(importances)
Most important feature is duration LOL, can be true .... then balance which makes sense
#Googled this method of printing decision tree
# FIX: sklearn.externals.six was removed in scikit-learn 0.23; the standard
# library's io.StringIO provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# Export the fitted tree to DOT format, then render it as an inline PNG
dot_data = StringIO()
export_graphviz(dt_model, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Evaluate the unregularised tree on the held-out test set
y_pred = dt_model.predict(X_test)
# score() returns mean accuracy over the test samples
model_score = dt_model.score(X_test, y_test)
print("Model Score")
print(model_score)
print("\n ===========")
# Confusion matrix: rows = true classes, columns = predicted classes
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test, y_pred))
Decision tree with Regularisation
# Regularised tree: cap the depth at 5 to limit overfitting
dt_model1 = DecisionTreeClassifier(criterion= 'entropy', max_depth=5)
dt_model1.fit(X_train,y_train)
#Googled this method of printing decision tree
# FIX: sklearn.externals.six was removed in scikit-learn 0.23; the standard
# library's io.StringIO provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# Export the depth-limited tree (dt_model1) to DOT format and render as PNG
dot_data = StringIO()
export_graphviz(dt_model1, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Evaluate the regularised tree: test accuracy, train accuracy, confusion matrix
y_pred = dt_model1.predict(X_test)
# Accuracy on the held-out test set
print("model score on test data (X_test, y_test)")
model_score = dt_model1.score(X_test, y_test)
print(model_score)
# BUG FIX: the original second score was dt_model1.score(X_test, y_pred), which
# compares the model's predictions against themselves and is therefore always
# 1.0 (meaningless). Report the training-set accuracy instead, which is the
# useful companion number for judging over/under-fitting.
print("model score on training data (X_train, y_train)")
print(dt_model1.score(X_train, y_train))
print("\n ===========")
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test, y_pred))
Random Forest Classifier
# Random-forest ensemble with 6 trees; a fixed random_state makes the
# bootstrap sampling — and hence the reported score — reproducible across runs.
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators=6, random_state=1)
rfcl.fit(X_train, y_train)
test_pred = rfcl.predict(X_test)
# Mean accuracy on the held-out test set (shown as the notebook cell output)
rfcl.score(X_test, y_test)
Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees: 50 boosting stages, learning rate 0.09, depth-5 trees
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.09, max_depth=5)
gbcl = gbcl.fit(X_train, y_train)
# BUG FIX: the original predicted and scored with rfcl (the random forest), so
# the gradient-boosting model was never actually evaluated — evaluate gbcl.
test_pred = gbcl.predict(X_test)
gbcl.score(X_test, y_test)
$\color{red}{\text{Results :}}$
| Models | Model Score | Comment |
|---|---|---|
| Decision Tree without Regularisation | 86% | Decision tree is huge |
| Decision Tree with Regularisation (max depth = 5) | 90% | Decision tree is small as depth is limited to 5 |
| Using Random Forest Classifier (estimators = 6) | 89% | Approximately equal to the decision tree with regularisation |
| Using Gradient Boosting (estimators = 50, learning rate = 0.09, max depth = 5) | 89% | Approximately equal to the decision tree with regularisation and the Random Forest |
$\color{blue}{\text{ Questions :}}$
Three models are giving roughly the same accuracy — is there any way to differentiate between them (e.g. other metrics such as precision/recall, ROC-AUC, or cross-validation)?
In videos for decision tree :
Links referred: